# Loading Libraries 

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(modelr)
library(nycflights13)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(splines)
library(forcats)
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Introduction and Description

The data is provided by data.gov.uk. It describes road accidents across Leeds in the year 2017. Between all datasets, this dataset was selected because it includes around 2,200 accident with 15 variable that can be easily parsed and explored. Also, we selected this dataset because we believe that there are different questions that can be asked about the data, and that the data can provide interesting answers for our questions. The dataset includes data such as location, date, time, road setting and casualty information for each accident that transpired during the year.

Reference : https://data.world/datagov-uk/6efe5505-941f-45bf-b576-4c1e09b579a1

Data Parsing and Import

# Loading & Parsing Data

Accidents_2017 <- read_csv("Data/datagov-uk-6efe5505-941f-45bf-b576-4c1e09b579a1/2017-8.csv", 
                           col_names = c("Reference_Number","Easting","Northing","Vehicles_Num","Accident_Date","Time","Road_Class","Road_Surface",
                                         "Lightning_Cond","Weather_Cond","Vehicle_Type","Casualty_Class","Severity","Gender","Age"), skip = 1)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Reference_Number = col_character(),
##   Easting = col_double(),
##   Northing = col_double(),
##   Vehicles_Num = col_double(),
##   Accident_Date = col_character(),
##   Time = col_character(),
##   Road_Class = col_character(),
##   Road_Surface = col_character(),
##   Lightning_Cond = col_character(),
##   Weather_Cond = col_character(),
##   Vehicle_Type = col_character(),
##   Casualty_Class = col_character(),
##   Severity = col_character(),
##   Gender = col_character(),
##   Age = col_double()
## )
Accidents_Date <- Accidents_2017$Accident_Date
Accidents_Date <- parse_date(Accidents_Date, "%m/%d/%Y")

Time <- Accidents_2017$Time
Time <- parse_time(Time, "%H%M")

Accidents_2017 <- Accidents_2017 %>%
  select(1:4, 7:15) %>%
  mutate(Accidents_Date, Time) 

Accidents_2017 <- Accidents_2017[,c(1:4,14,15,5:13)]

# Correcting spelling mistakes

Lightning_Cond <- 
  str_replace(Accidents_2017$Lightning_Cond,"Darkness: Street lights present and lit and lit","Darkness: Street lights present and lit")


Road_Class <- str_replace_all(Accidents_2017$Road_Class,
      c("A.*" = "A","B.*" = "B","M.*" = "M"))


Road_Surface <- str_replace(Accidents_2017$Road_Surface,"^F.*","Snow")


Weather_Cond <- word(Accidents_2017$Weather_Cond,1) 
Weather_Cond <- str_replace(Weather_Cond,"Fog","Other")


Vehicle_Type <- word(Accidents_2017$Vehicle_Type,1)
Vehicle_Type <- str_replace_all(Vehicle_Type,
      c(".Private" = "Taxi",
        "Ca.*" = "Car","Pedal" = "Cycle"))
Vehicle_Type <- str_replace(Vehicle_Type,"TaxiTaxi", "Taxi")


# Adding the improved columns to the dataset 

Accidents_2017 <- Accidents_2017 %>% 
  select(1:6,12:15) %>%
  mutate(Lightning_Cond,Weather_Cond,Road_Class,Road_Surface,Vehicle_Type,
         Year = year(Accidents_Date),
         Month = month(Accidents_Date),
         Day = day(Accidents_Date),
         Hour = hour(Time),
         Minute = minute(Time),
         Accidents_DateTime = make_datetime(Year, Month, Day, Hour, Minute))

Accidents_2017 <- Accidents_2017[,c(1:4, 16:20, 5:6, 21, 7:15)]

# Assigning the columns to their proper classes. 

for(i in 2:length(Accidents_2017)) {
    if(is.character(Accidents_2017[[i]])) {
        Accidents_2017[[i]] <- as.factor(Accidents_2017[[i]])
    } else if (is.numeric(Accidents_2017[[i]])) {
        Accidents_2017[[i]] <- as.numeric(Accidents_2017[[i]])
    }
}

Accidents_2017
## # A tibble: 2,203 x 21
##    Reference_Number Easting Northing Vehicles_Num  Year Month   Day  Hour Minute
##    <chr>              <dbl>    <dbl>        <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
##  1 3AP0313           426340   428455            1  2017     3    17     8     15
##  2 3BE0850           430828   433222            2  2017     1    14    13     30
##  3 4110858           428940   429856            2  2017     1     1     8      5
##  4 4110858           428940   429856            2  2017     1     1     8      5
##  5 4111495           429899   434277            2  2017     1     1    17      5
##  6 4111706           435946   436807            2  2017     1     1    12      0
##  7 4120471           443658   436768            3  2017     1     2    12     30
##  8 4120471           443658   436768            3  2017     1     2    12     30
##  9 4121054           442103   434572            2  2017     1     2    18      7
## 10 4121054           442103   434572            2  2017     1     2    18      7
## # ... with 2,193 more rows, and 12 more variables: Accidents_Date <date>,
## #   Time <time>, Accidents_DateTime <dttm>, Casualty_Class <fct>,
## #   Severity <fct>, Gender <fct>, Age <dbl>, Lightning_Cond <fct>,
## #   Weather_Cond <fct>, Road_Class <fct>, Road_Surface <fct>,
## #   Vehicle_Type <fct>
# Checking NAs 

sum(rowSums(is.na(Accidents_2017)))
## [1] 0
# No NA values to explore

Exploring Data Analysis

EDA1 <- Accidents_2017 %>%
  group_by(Gender, Age) %>%
  summarise(Count = n() ) %>%
  ungroup() %>%
  ggplot(mapping = aes(x=Age , y = Count)) +
  geom_point(alpha = 0.6) + 
  facet_wrap(~Gender) +
  ggtitle("Number of Casualties Per Age For Males and Females") +
  ylab("Number of Casualties") +
  theme_linedraw()
## `summarise()` has grouped output by 'Gender'. You can override using the `.groups` argument.
layout_plot <- function(my_plot, x = -0.055, y = - 0.03){
  my_plot[['x']][['layout']][['annotations']][[1]][['y']] <- x
  my_plot[['x']][['layout']][['annotations']][[2]][['x']] <- y
  my_plot
}

ggplotly(EDA1) %>% layout_plot

The plot shows the relationship between the Number of Casualities and their Age. In addition, the plot is classified by the Gender, therefore; it shows how the relationship differs between males and females.

For females, as the age increases the count increases. The count peaks around the mid-20s, then the count declines gradually. The males’ graph has a similar shape as the females’. The only difference is that the males’ graph has a higher peak and the drop is sharper.

Accidents_2017 %>%
  group_by(Gender, Age) %>%
  summarise(Count = n() )
## `summarise()` has grouped output by 'Gender'. You can override using the `.groups` argument.
## # A tibble: 182 x 3
## # Groups:   Gender [2]
##    Gender   Age Count
##    <fct>  <dbl> <int>
##  1 Female     1     8
##  2 Female     2     3
##  3 Female     3     6
##  4 Female     4     2
##  5 Female     5     6
##  6 Female     6     7
##  7 Female     7     8
##  8 Female     8     5
##  9 Female     9     4
## 10 Female    10     9
## # ... with 172 more rows
(Accidents_Hour <- Accidents_2017 %>% count(Hour))
## # A tibble: 24 x 2
##     Hour     n
##    <dbl> <int>
##  1     0    30
##  2     1    23
##  3     2     9
##  4     3    15
##  5     4    16
##  6     5    12
##  7     6    41
##  8     7    85
##  9     8   143
## 10     9   105
## # ... with 14 more rows
(DoW_Hour <- Accidents_2017 %>% 
  mutate(day_of_week = wday(Accidents_Date, label = TRUE))%>%
  count(day_of_week, Hour))
## # A tibble: 158 x 3
##    day_of_week  Hour     n
##    <ord>       <dbl> <int>
##  1 Sun             0    14
##  2 Sun             1     6
##  3 Sun             2     2
##  4 Sun             3     2
##  5 Sun             4     6
##  6 Sun             6     5
##  7 Sun             7     1
##  8 Sun             8     8
##  9 Sun             9     6
## 10 Sun            10     5
## # ... with 148 more rows
ggplot(Accidents_Hour, mapping = aes(Hour, n)) +
  geom_line() +
  geom_point()

ggplot(DoW_Hour, mapping = aes(Hour, n, color = day_of_week))+
  geom_line()+
  geom_point()

ggplot(Accidents_Hour, mapping = aes(Hour,n))+
  geom_line()+
  geom_point()+
  geom_line(data = DoW_Hour, mapping = aes(Hour,n,color = day_of_week))

# Distribution of Accidents across the course of the day for every day of the year

Accidents_2017 %>%
  mutate(Accidents_by_Hour = update(Accidents_DateTime, yday = 1)) %>%
  ggplot(mapping = aes(Accidents_by_Hour)) +
  geom_freqpoly(binwidth = 3600) # 5 mins interval 

# Distribution of Accidents across the course of the day for every day (and month) of the year

Accidents_2017 %>%
  mutate(Accidents_by_Hour = update(Accidents_DateTime, yday = 1),
         Accidents_by_Month = factor(Month)) %>%
  ggplot(mapping = aes(Accidents_by_Hour, color = Accidents_by_Month)) +
  geom_freqpoly(binwidth = 3600) # 1 hour interval

# Distribution of Accidents across the course of the day for every day (and month) of the year w/ y-axis as density

Accidents_2017 %>%
  mutate(Accidents_by_Hour = update(Accidents_DateTime, yday = 1),
         Accidents_by_Month = factor(Month)) %>%
  ggplot(mapping = aes(Accidents_by_Hour, color = Accidents_by_Month)) +
  geom_freqpoly(mapping = aes(y = ..density..), binwidth = 3600) # 1 hour interval

daily <- Accidents_2017 %>%
  group_by(Accidents_Date) %>%
  summarise(n = n()) %>%
  mutate(wday = wday(Accidents_Date, label = T, abbr = F))

daily
## # A tibble: 360 x 3
##    Accidents_Date     n wday     
##    <date>         <int> <ord>    
##  1 2017-01-01         4 Sunday   
##  2 2017-01-02         7 Monday   
##  3 2017-01-03         3 Tuesday  
##  4 2017-01-04         7 Wednesday
##  5 2017-01-05         4 Thursday 
##  6 2017-01-06         5 Friday   
##  7 2017-01-07         5 Saturday 
##  8 2017-01-09         5 Monday   
##  9 2017-01-10         9 Tuesday  
## 10 2017-01-11         7 Wednesday
## # ... with 350 more rows
# Number of flights per day

ggplot(daily, mapping = aes(Accidents_Date, n)) +
  geom_line()

# Number of flights in each day of the week
ggplot(daily, mapping = aes(wday, n)) + 
  geom_boxplot()

ggplot(daily, mapping = aes(wday, n)) + 
  geom_bar(stat = "identity")

# Fitting a linear model w/ predictions & residuals

model1 <- lm(n ~ wday, data = daily)
grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(model1, "n")
ggplot(daily, mapping = aes(wday, n))+
  geom_boxplot() +
  geom_point(data = grid, colour = "red", size = 4)

daily <- daily %>%
  add_residuals(model1)
ggplot(data = daily, mapping = aes(Accidents_Date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()